Most people attempting to model the distribution of a rare species do not necessarily have 1200 sites worth of data. They often are focused on a smaller region. Additionally, using more than 1000 sites is not recommended for the full spatial joint HMSC model (In that case, either a GPP or NNGP are suggested as alternatives, but they present their own unique complications as either the number of neighbors or the number of knots to use have to be chosen in some fashion).
Thus, to address both of these issues, we decided to reduce the sites to a subset of 400-600 sites (as Norberg et al. did).
Here we choose the 600 most Southern sites (or the 400). Determining whether to use 400-600 will depend on figuring out whether only 400 sites allows the choice of \(\mu_{ext}\) to be far enough away from \(\mu_{avg}\) while still leaving enough sites that are within 1 standard deviation of \(\mu_{ext}\).
# US state outlines (ggplot2/maps) reused as the base layer for every map below.
us <- map_data('state')

# Show the retained southern sites (red) on an Albers-projected state map.
ggplot() +
  geom_polygon(data = us,
               aes(x = long, y = lat, group = group),
               colour = "grey20", fill = "grey80") +
  geom_point(data = south, aes(x = long, y = lat), colour = "red") +
  coord_map(projection = "albers", lat0 = 39, lat1 = 40,
            xlim = c(-100, -75), ylim = c(25, 40))

# Per-species presence counts over the southern sites
# (species presence/absence indicators occupy columns 6:67 here).
colSums(south[, 6:67])
## sp1 sp2 sp3 sp4 sp5 sp6 sp7 sp8 sp9 sp10 sp11 sp12 sp13 sp14 sp15 sp16
## 41 229 24 22 56 20 10 64 18 55 21 16 200 10 24 44
## sp17 sp18 sp19 sp20 sp21 sp22 sp23 sp24 sp25 sp26 sp27 sp28 sp29 sp30 sp31 sp32
## 18 71 12 4 10 43 12 16 28 15 70 11 121 21 25 21
## sp33 sp34 sp35 sp36 sp37 sp38 sp39 sp40 sp41 sp42 sp43 sp44 sp45 sp46 sp47 sp48
## 105 51 19 31 12 88 42 17 27 13 26 31 14 32 20 4
## sp49 sp50 sp51 sp52 sp53 sp54 sp55 sp56 sp57 sp58 sp59 sp60 sp61 sp62
## 8 31 57 22 124 17 1 50 17 8 11 11 56 20
# Colour palette shared by the per-species maps in the next chunk.
cols <- brewer.pal(3, "Dark2")

# Species richness per site: N = number of species present at that site.
# Vectorized rowSums replaces the original per-row for loop over a hard-coded
# 1:1200, so this also works if the number of sites changes.
# NOTE(review): columns 6:68 are summed here, but the colSums check above used
# 6:67 -- confirm which range covers exactly the species columns.
temp <- dat
temp$N <- rowSums(temp[, 6:68])

# Map richness across all sites: colour and point size both encode N.
ggplot() +
  geom_polygon(data = us,
               aes(x = long, y = lat, group = group),
               colour = "grey20", fill = "grey80") +
  geom_point(data = temp,
             aes(x = long, y = lat, color = N, size = N),
             shape = 19, alpha = 0.8) +
  scale_color_viridis() +
  scale_size(range = c(0.01, 3)) +
  coord_map(projection = "albers", lat0 = 39, lat1 = 40,
            xlim = c(-100, -65), ylim = c(25, 50))
# One map per species: southern sites drawn in cols[1], northern in cols[2].
# Crosses (shape 4) mark absences; filled dots (shape 19) mark presences.
for (sp in 1:63) {
  s_dat <- south[, c(1:5, sp + 5)]
  n_dat <- north[, c(1:5, sp + 5)]
  names(s_dat) <- c("long", "lat", "PC1", "PC2", "PC3", "SP")
  names(n_dat) <- c("long", "lat", "PC1", "PC2", "PC3", "SP")
  p <- ggplot() +
    geom_polygon(data = us,
                 aes(x = long, y = lat, group = group),
                 colour = "grey20", fill = "grey80") +
    # South: absences first so presences plot on top
    geom_point(data = s_dat[s_dat$SP == 0, ],
               aes(x = long, y = lat),
               colour = cols[1], shape = 4, alpha = 0.2, size = 0.1) +
    geom_point(data = s_dat[s_dat$SP == 1, ],
               aes(x = long, y = lat),
               colour = cols[1], shape = 19, alpha = 1, size = 1) +
    # North: same layering
    geom_point(data = n_dat[n_dat$SP == 0, ],
               aes(x = long, y = lat),
               colour = cols[2], shape = 4, alpha = 0.2, size = 0.1) +
    geom_point(data = n_dat[n_dat$SP == 1, ],
               aes(x = long, y = lat),
               colour = cols[2], shape = 19, alpha = 1, size = 1) +
    ggtitle(paste0("Species: ", sp)) +
    coord_map(projection = "albers", lat0 = 39, lat1 = 40,
              xlim = c(-100, -65), ylim = c(25, 50))
  plot(p)
}
# Pairwise views of the environmental PC space: southern sites (cols[1])
# overlaid with northern sites (cols[2]).

#PC1 x PC2
ggplot() +
  geom_point(data = south, aes(x = PC1, y = PC2),
             color = cols[1], shape = 16, size = 2, alpha = 0.25,
             show.legend = FALSE) +
  geom_point(data = north, aes(x = PC1, y = PC2),
             color = cols[2], shape = 16, size = 2, alpha = 0.25) +
  xlab(bquote(mu[PC1])) + ylab(bquote(mu[PC2]))

#PC1 x PC3
ggplot() +
  geom_point(data = south, aes(x = PC1, y = PC3),
             color = cols[1], shape = 16, size = 2, alpha = 0.25,
             show.legend = FALSE) +
  geom_point(data = north, aes(x = PC1, y = PC3),
             color = cols[2], shape = 16, size = 2, alpha = 0.25) +
  xlab(bquote(mu[PC1])) + ylab(bquote(mu[PC3]))

#PC2 x PC3
ggplot() +
  geom_point(data = south, aes(x = PC2, y = PC3),
             color = cols[1], shape = 16, size = 2, alpha = 0.25,
             show.legend = FALSE) +
  geom_point(data = north, aes(x = PC2, y = PC3),
             color = cols[2], shape = 16, size = 2, alpha = 0.25) +
  xlab(bquote(mu[PC2])) + ylab(bquote(mu[PC3]))
# Per-species niche summaries over the southern sites: mean / sd / min / max of
# each PC at the sites where the species is present, plus the presence count N.
# Row 64 ("Mean") holds community-wide averages of the per-species means and sds.
# All columns are declared up front. The original grew the min/max columns with
# `$<-` inside the loop, which (via recycling on the first iteration) left row 64
# silently holding sp1's min/max; here row 64 is NA, the honest value for the
# "Mean" row, whose min/max are never computed.
X_bar <- data.frame(mu_PC1  = rep(0, 64),
                    sd_PC1  = rep(0, 64),
                    mu_PC2  = rep(0, 64),
                    sd_PC2  = rep(0, 64),
                    mu_PC3  = rep(0, 64),
                    sd_PC3  = rep(0, 64),
                    min_PC1 = rep(NA_real_, 64),
                    max_PC1 = rep(NA_real_, 64),
                    min_PC2 = rep(NA_real_, 64),
                    max_PC2 = rep(NA_real_, 64),
                    min_PC3 = rep(NA_real_, 64),
                    max_PC3 = rep(NA_real_, 64),
                    N = rep(NA, 64))
rownames(X_bar) <- c(paste0("sp", 1:63), "Mean")
for (j in 1:63) {
  occ <- subset(south, south[, j + 5] == 1)  # sites where species j is present
  X_bar$mu_PC1[j]  <- mean(occ$PC1)
  X_bar$sd_PC1[j]  <- sd(occ$PC1)  # NA when the species has a single presence
  X_bar$min_PC1[j] <- min(occ$PC1)
  X_bar$max_PC1[j] <- max(occ$PC1)
  X_bar$mu_PC2[j]  <- mean(occ$PC2)
  X_bar$sd_PC2[j]  <- sd(occ$PC2)
  X_bar$min_PC2[j] <- min(occ$PC2)
  X_bar$max_PC2[j] <- max(occ$PC2)
  X_bar$mu_PC3[j]  <- mean(occ$PC3)
  X_bar$sd_PC3[j]  <- sd(occ$PC3)
  X_bar$min_PC3[j] <- min(occ$PC3)
  X_bar$max_PC3[j] <- max(occ$PC3)
  X_bar$N[j]       <- nrow(occ)
}
# Community averages (row 64): mean of the per-species summaries.
# na.rm = TRUE drops the NA sds of single-presence species.
X_bar$mu_PC1[64] <- mean(X_bar$mu_PC1[1:63])
X_bar$sd_PC1[64] <- mean(X_bar$sd_PC1[1:63], na.rm = TRUE)
X_bar$mu_PC2[64] <- mean(X_bar$mu_PC2[1:63])
X_bar$sd_PC2[64] <- mean(X_bar$sd_PC2[1:63], na.rm = TRUE)
X_bar$mu_PC3[64] <- mean(X_bar$mu_PC3[1:63])
X_bar$sd_PC3[64] <- mean(X_bar$sd_PC3[1:63], na.rm = TRUE)
# Species niche means (filled circles; the large circle is the community
# average, row 64) overlaid on site-level PC values, with the third PC mapped
# to colour (sites) and fill (species means) on a shared viridis scale.

#PC1 vs PC2
ggplot() +
  geom_point(data = south, aes(x = PC1, y = PC2, color = PC3),
             shape = 16, size = 2, alpha = 0.25, show.legend = FALSE) +
  scale_color_continuous(type = "viridis", limits = c(-10, 3)) +
  geom_point(data = X_bar[1:63, ],
             aes(x = mu_PC1, y = mu_PC2, fill = mu_PC3), shape = 21, size = 2) +
  geom_point(data = X_bar[64, ],
             aes(x = mu_PC1, y = mu_PC2, fill = mu_PC3), shape = 21, size = 5) +
  scale_fill_continuous(type = "viridis", limits = c(-10, 3)) +
  xlab(bquote(mu[PC1])) + ylab(bquote(mu[PC2])) +
  labs(fill = bquote(mu[PC3]))

#PC1 vs PC3
ggplot() +
  geom_point(data = south, aes(x = PC1, y = PC3, color = PC2),
             shape = 16, size = 2, alpha = 0.25, show.legend = FALSE) +
  scale_color_continuous(type = "viridis", limits = c(-9, 16)) +
  geom_point(data = X_bar[1:63, ],
             aes(x = mu_PC1, y = mu_PC3, fill = mu_PC2), shape = 21, size = 2) +
  geom_point(data = X_bar[64, ],
             aes(x = mu_PC1, y = mu_PC3, fill = mu_PC2), shape = 21, size = 5) +
  scale_fill_continuous(type = "viridis", limits = c(-9, 16)) +
  xlab(bquote(mu[PC1])) + ylab(bquote(mu[PC3])) +
  labs(fill = bquote(mu[PC2]))

#PC2 vs PC3
ggplot() +
  geom_point(data = south, aes(x = PC2, y = PC3, color = PC1),
             shape = 16, size = 2, alpha = 0.25, show.legend = FALSE) +
  scale_color_continuous(type = "viridis", limits = c(-9, 13)) +
  geom_point(data = X_bar[1:63, ],
             aes(x = mu_PC2, y = mu_PC3, fill = mu_PC1), shape = 21, size = 2) +
  geom_point(data = X_bar[64, ],
             aes(x = mu_PC2, y = mu_PC3, fill = mu_PC1), shape = 21, size = 5) +
  scale_fill_continuous(type = "viridis", limits = c(-9, 13)) +
  xlab(bquote(mu[PC2])) + ylab(bquote(mu[PC3])) +
  labs(fill = bquote(mu[PC1]))
# Niche breadth (sd of each PC at occupied sites) against number of presences.
# Dashed black lines: 10th and 90th percentiles of the per-species sds
# (quantiles are taken over the whole sd column, including the mean row);
# dashed red line: the community-average sd (row 64).
ggplot() +
  geom_point(data = X_bar[1:63, ], aes(x = N, y = sd_PC1), shape = 16) +
  ylab(bquote(sigma[PC1])) +
  geom_hline(yintercept = quantile(X_bar$sd_PC1, probs = 0.10, na.rm = TRUE),
             linetype = "dashed") +
  geom_hline(yintercept = quantile(X_bar$sd_PC1, probs = 0.90, na.rm = TRUE),
             linetype = "dashed") +
  geom_hline(yintercept = X_bar$sd_PC1[64], linetype = "dashed", col = "red")

ggplot() +
  geom_point(data = X_bar[1:63, ], aes(x = N, y = sd_PC2), shape = 16) +
  ylab(bquote(sigma[PC2])) +
  geom_hline(yintercept = quantile(X_bar$sd_PC2, probs = 0.10, na.rm = TRUE),
             linetype = "dashed") +
  geom_hline(yintercept = quantile(X_bar$sd_PC2, probs = 0.90, na.rm = TRUE),
             linetype = "dashed") +
  geom_hline(yintercept = X_bar$sd_PC2[64], linetype = "dashed", col = "red")

ggplot() +
  geom_point(data = X_bar[1:63, ], aes(x = N, y = sd_PC3), shape = 16) +
  ylab(bquote(sigma[PC3])) +
  geom_hline(yintercept = quantile(X_bar$sd_PC3, probs = 0.10, na.rm = TRUE),
             linetype = "dashed") +
  geom_hline(yintercept = quantile(X_bar$sd_PC3, probs = 0.90, na.rm = TRUE),
             linetype = "dashed") +
  geom_hline(yintercept = X_bar$sd_PC3[64], linetype = "dashed", col = "red")
# Candidate values for the extreme species' niche centre (mu) and breadth (sd)
# on PC1 and PC3. The gaps in the mu sequences (-2.5..-2 and -1..1) skip values
# too close to the community average.
tries_PC1 <- c(seq(from = -9, to = -2.5, by = 0.5), seq(from = -2, to = 1, by = 0.5))
tries_PC3 <- c(seq(from = -10, to = -1, by = 0.5), seq(from = 1, to = 3, by = 0.5))
tries_sigma_narrow_PC1 <- seq(from = 0.1, to = 3, by = 0.1)
tries_sigma_narrow_PC3 <- seq(from = 0.1, to = 3, by = 0.1)
# Full factorial grid over the four parameters.
tries <- expand.grid(tries_PC1, tries_PC3, tries_sigma_narrow_PC1, tries_sigma_narrow_PC3)
names(tries) <- c(
  "mu_PC1", "mu_PC3",
  "sd_PC1", "sd_PC3"
)
# Community-average niche centre (row 64 of X_bar) and wide covariance: the
# 90th-percentile per-species sds, squared to give variances on the diagonal.
mu_avg <- c(X_bar$mu_PC1[64], X_bar$mu_PC3[64])
Sigma_wide <- diag(x = c(quantile(X_bar$sd_PC1, probs = 0.90, na.rm = TRUE)^2,
                         quantile(X_bar$sd_PC3, probs = 0.90, na.rm = TRUE)^2))
# One row per candidate; the three p128 ratios are filled in by the loop below.
# (Builds directly from `tries` instead of copying each column by hand.)
df_tries <- data.frame(tries,
                       p128_narrow_avg = NA_real_,
                       p128_wide_ext = NA_real_,
                       p128_narrow_ext = NA_real_)
# Grid search: for each candidate (mu_ext, Sigma_narrow), compute the ratio of
# the 128th-largest normalized site suitability to the maximum, under three
# scenarios (narrow/avg, narrow/ext, wide/ext). A ratio > 0.5 means at least
# 128 sites are within roughly half the peak suitability, so 2 * 64 presences
# can be drawn without piling onto a handful of sites.
for (ii in 1:length(df_tries$mu_PC1)) {
mu_ext <- c(df_tries$mu_PC1[ii], df_tries$mu_PC3[ii])
# NOTE(review): the sd_* values are placed directly on the diagonal of the
# covariance matrix, i.e. treated as variances, whereas Sigma_wide above squares
# its sds -- confirm whether diag(c(sd^2, sd^2)) was intended here.
Sigma_narrow <- diag(c(df_tries$sd_PC1[ii], df_tries$sd_PC3[ii]))
# Suitability of every southern site (columns 3 and 5 of `south` are PC1, PC3),
# normalized to sum to 1 and sorted descending so index 128 is the 128th-best.
L_narrow_avg <- dmvnorm(south[,c(3,5)],
mean=mu_avg,
sigma=Sigma_narrow )
L_narrow_avg <- L_narrow_avg/sum(L_narrow_avg)
L_narrow_avg <- sort(L_narrow_avg, decreasing=T)
L_narrow_ext <- dmvnorm(south[,c(3,5)],
mean=mu_ext,
sigma=Sigma_narrow )
L_narrow_ext <- L_narrow_ext/sum(L_narrow_ext)
L_narrow_ext <- sort(L_narrow_ext, decreasing=T)
L_wide_ext <- dmvnorm(south[,c(3,5)],
mean=mu_ext,
sigma=Sigma_wide )
L_wide_ext <- L_wide_ext/sum(L_wide_ext)
L_wide_ext <- sort(L_wide_ext, decreasing=T)
# Ratio of 128th-best to best suitability for each scenario.
df_tries$p128_narrow_avg[ii] <- L_narrow_avg[128]/max(L_narrow_avg)
df_tries$p128_narrow_ext[ii] <- L_narrow_ext[128]/max(L_narrow_ext)
df_tries$p128_wide_ext[ii] <-L_wide_ext[128]/max(L_wide_ext)
# Progress report every 10,000 candidates (the grid has ~450k rows).
if (ii %% 10000 == 0) { cat(paste0(ii, " of ", length(df_tries$mu_PC1), "\n"))
}
}
# Persist the grid-search results (the loop above is slow), then filter down to
# an acceptable (mu_ext, Sigma_narrow) combination.
save(df_tries, file="../data/df_tries.RData")
load("../data/df_tries.RData")
# Require that in all three scenarios the 128th-best site is at least half as
# suitable as the best site.
df_tries2 <- subset(df_tries, df_tries$p128_narrow_ext>0.5)
df_tries3 <- subset(df_tries2, df_tries2$p128_wide_ext>0.5)
df_tries4 <- subset(df_tries3, df_tries3$p128_narrow_avg>0.5)
# Narrowest feasible niche breadth on PC1 among the survivors.
min(df_tries4$sd_PC1)
## [1] 0.7
df_tries5 <- subset(df_tries4, df_tries4$sd_PC1 <1)
# Narrowest feasible breadth on PC3 given the PC1 constraint.
min(df_tries5$sd_PC3)
## [1] 2.5
df_tries6 <- df_tries5[df_tries5$sd_PC3<2.6,]
# Keep the extreme mean on PC1 from drifting too far into the tail.
df_tries7 <- subset(df_tries6, df_tries6$mu_PC1 > -4)
df_tries7[order(df_tries7$p128_narrow_ext, decreasing=F),]
## mu_PC1 mu_PC3 sd_PC1 sd_PC3 p128_narrow_avg p128_wide_ext
## 367323 -3.5 1.0 0.9 2.5 0.5025202 0.8216325
## 367365 -3.5 2.0 0.9 2.5 0.5025202 0.8472845
## 367344 -3.5 1.5 0.9 2.5 0.5025202 0.8615871
## p128_narrow_ext
## 367323 0.5081342
## 367365 0.5427180
## 367344 0.5461169
# Final choice of extreme niche centre and narrow covariance.
# NOTE(review): as in the grid search, 0.9 and 2.5 go on the covariance
# diagonal unsquared (i.e. as variances) -- confirm this matches the intent.
mu_ext <- c(-3.5, 2)
Sigma_narrow <- diag(c(0.9, 2.5))
mu_avg <- c(X_bar$mu_PC1[64], X_bar$mu_PC3[64])
Sigma_wide <- diag(x=c(quantile(X_bar$sd_PC1, probs=0.90, na.rm=T)^2, quantile(X_bar$sd_PC3, probs=0.90, na.rm=T)^2))
File structure: Model Type → Species Type → Sample Size, with each combination containing the 30 replicates and 3 data-splits.
There are 4 scenarios: narrow niche (narrow) vs. wide niche (wide), and community-average (avg) vs extreme (extreme).
# Overlay the four theoretical niche densities (wide/narrow x avg/extreme) on
# histograms of the site PC values ("Sites") and the per-species niche means
# ("Species"). The colour/fill aesthetics are mapped to dummy letters ("a".."f")
# so manual scales can build the two legends.
cols <- viridis(4)
#PC1
# NOTE(review): `..density..` is the pre-ggplot2-3.4 spelling of
# after_stat(density); kept to match the rest of the file.
ggplot(data = data.frame(x = c(-9, 13)), aes(x)) +
geom_histogram(data=south, aes(x=PC1, y= ..density.., fill="e"), binwidth=1, alpha=0.5) +
geom_histogram(data=X_bar[1:63,], aes(x=mu_PC1, y=..density.., fill="f"), binwidth=1, alpha=0.5)+
stat_function(fun = dnorm, n = 101, args = list(mu_avg[1], sd = sqrt(Sigma_wide[1,1])), aes(color = "a")) +
stat_function(fun = dnorm, n = 101, args = list(mean = mu_avg[1], sd = sqrt(Sigma_narrow[1,1])), aes(color="b")) +
stat_function(fun = dnorm, aes(color="c"), n = 101, args = list(mu_ext[1], sd = sqrt(Sigma_narrow[1,1]))) +
stat_function(fun = dnorm, aes(color="d"), n = 101, args = list(mu_ext[1], sd = sqrt(Sigma_wide[1,1]))) +
ylab("Density") + xlab("PC1") +
scale_colour_manual(name = 'Case', guide='legend',
values =c("a" = cols[1], "b" = cols[2], "c" = cols[3], "d"=cols[4]), labels = c('Wide - Avg','Narrow - Avg', "Narrow - Ext", "Wide - Ext")) +
scale_fill_manual(name = "Mean Value", guide='legend',
values=c("e"="grey", "f"="green"),
labels=c("Sites", "Species"))
# PC2 is excluded from the simulated niches (see prose below), so its
# comparison plot is kept but commented out.
# #PC2
# ggplot(data = data.frame(x = c(-9, 16)), aes(x)) +
# geom_histogram(data=dat, aes(x=PC2, y= ..density.., fill="e"), binwidth=1, alpha=0.5) +
# geom_histogram(data=X_bar[1:63,], aes(x=mu_PC2, y=..density.., fill="f"), binwidth=1, alpha=0.5)+
# stat_function(fun = dnorm, n = 101, args = list(mu_avg[2], sd = sqrt(Sigma_wide[2,2])), aes(color = "a")) +
# stat_function(fun = dnorm, n = 101, args = list(mean = mu_avg[2], sd = sqrt(Sigma_narrow[2,2])), aes(color="b")) +
# stat_function(fun = dnorm, aes(color="c"), n = 101, args = list(mu_ext[2], sd = sqrt(Sigma_narrow[2,2]))) +
# stat_function(fun = dnorm, aes(color="d"), n = 101, args = list(mu_ext[2], sd = sqrt(Sigma_wide[2,2]))) +
# ylab("Density") + xlab("PC2") +
# scale_colour_manual(name = 'Case', guide='legend',
# values =c("a" = cols[1], "b" = cols[2], "c" = cols[3], "d"=cols[4]), labels = c('Wide - Avg','Narrow - Avg', "Narrow - Ext", "Wide - Ext")) +
# scale_fill_manual(name = "Mean Value", guide='legend',
# values=c("e"="grey", "f"="green"),
# labels=c("Sites", "Species"))
#PC3
# Index 2 of mu_avg / mu_ext and [2,2] of the Sigmas refer to PC3: the mu
# vectors and covariance matrices are 2-D over (PC1, PC3) only.
ggplot(data = data.frame(x = c(-11, 4)), aes(x)) +
geom_histogram(data=south, aes(x=PC3, y= ..density.., fill="e"), binwidth=1, alpha=0.5) +
geom_histogram(data=X_bar[1:63,], aes(x=mu_PC3, y=..density.., fill="f"), binwidth=1, alpha=0.5)+
stat_function(fun = dnorm, n = 101, args = list(mu_avg[2], sd = sqrt(Sigma_wide[2,2])), aes(color = "a")) +
stat_function(fun = dnorm, n = 101, args = list(mean = mu_avg[2], sd = sqrt(Sigma_narrow[2,2])), aes(color="b")) +
stat_function(fun = dnorm, aes(color="c"), n = 101, args = list(mu_ext[2], sd = sqrt(Sigma_narrow[2,2]))) +
stat_function(fun = dnorm, aes(color="d"), n = 101, args = list(mu_ext[2], sd = sqrt(Sigma_wide[2,2]))) +
ylab("Density") + xlab("PC3") +
scale_colour_manual(name = 'Case', guide='legend',
values =c("a" = cols[1], "b" = cols[2], "c" = cols[3], "d"=cols[4]), labels = c('Wide - Avg','Narrow - Avg', "Narrow - Ext", "Wide - Ext")) +
scale_fill_manual(name = "Mean Value", guide='legend',
values=c("e"="grey", "f"="green"),
labels=c("Sites", "Species"))
It is unlikely that species respond strongly to all three PCs. We can therefore ignore one of the PCs, for the sake of simplicity.
Here we will fix PC2 as irrelevant for determining habitat suitability for our simulated species, but we could have just as easily assumed one of the other two PCs was irrelevant.
#Case 1: Community-average, wide niche
# Regular 100 x 100 grid over (PC1, PC3) spanning the southern sites, with the
# habitat-suitability density of each scenario evaluated at every grid cell.
x1 <- seq(-9, 2, length.out = 100)
x3 <- seq(-10.5, 3.5, length.out = 100)
my.palette <- brewer.pal(11, "RdYlBu")
df <- expand.grid(x1, x3)
names(df) <- c("PC1", "PC3")
df$L_wide_avg   <- dmvnorm(x = df[, 1:2], mean = mu_avg, sigma = Sigma_wide)
df$L_wide_ext   <- dmvnorm(x = df[, 1:2], mean = mu_ext, sigma = Sigma_wide)
df$L_narrow_avg <- dmvnorm(x = df[, 1:2], mean = mu_avg, sigma = Sigma_narrow)
df$L_narrow_ext <- dmvnorm(x = df[, 1:2], mean = mu_ext, sigma = Sigma_narrow)
# Habitat-suitability surfaces for the four scenarios, with the southern sites
# (crosses) overlaid; shared axis limits keep the panels comparable. Theme
# settings are combined into a single theme() call per plot.

#I. Wide Niche- Community Average
ggplot() + xlim(-9, 2) + ylim(-10.5, 3.5) +
  geom_tile(data = df, aes(x = PC1, y = PC3, fill = L_wide_avg)) +
  scale_fill_gradientn(colors = my.palette) +
  geom_point(data = south, aes(x = PC1, y = PC3),
             shape = 3, alpha = 0.2, size = 0.8) +
  theme(axis.text = element_text(size = 15),
        axis.title = element_text(size = 15),
        title = element_text(size = 15),
        plot.title = element_text(hjust = 0.5)) +
  xlab("PC1") + ylab("PC3") +
  ggtitle("I. Wide Niche - Community Average")

#II. Wide Niche - Extreme Value
ggplot() + xlim(-9, 2) + ylim(-10.5, 3.5) +
  geom_tile(data = df, aes(x = PC1, y = PC3, fill = L_wide_ext)) +
  scale_fill_gradientn(colors = my.palette) +
  geom_point(data = south, aes(x = PC1, y = PC3),
             shape = 3, alpha = 0.2, size = 0.8) +
  theme(axis.text = element_text(size = 15),
        axis.title = element_text(size = 15),
        title = element_text(size = 15),
        plot.title = element_text(hjust = 0.5)) +
  xlab("PC1") + ylab("PC3") +
  ggtitle("II. Wide Niche - Extreme Value")

#III. Narrow Niche - Community Average
ggplot() + xlim(-9, 2) + ylim(-10.5, 3.5) +
  geom_tile(data = df, aes(x = PC1, y = PC3, fill = L_narrow_avg)) +
  scale_fill_gradientn(colors = my.palette) +
  geom_point(data = south, aes(x = PC1, y = PC3),
             shape = 3, alpha = 0.2, size = 0.8) +
  theme(axis.text = element_text(size = 15),
        axis.title = element_text(size = 15),
        title = element_text(size = 15),
        plot.title = element_text(hjust = 0.5)) +
  xlab("PC1") + ylab("PC3") +
  ggtitle("III. Narrow Niche - Community Average")

#IV. Narrow Niche - Extreme Value
ggplot() + xlim(-9, 2) + ylim(-10.5, 3.5) +
  geom_tile(data = df, aes(x = PC1, y = PC3, fill = L_narrow_ext)) +
  scale_fill_gradientn(colors = my.palette) +
  geom_point(data = south, aes(x = PC1, y = PC3),
             shape = 3, alpha = 0.2, size = 0.8) +
  theme(axis.text = element_text(size = 15),
        axis.title = element_text(size = 15),
        title = element_text(size = 15),
        plot.title = element_text(hjust = 0.5)) +
  xlab("PC1") + ylab("PC3") +
  ggtitle("IV. Narrow Niche - Extreme Value")
Let \(N \in \{2, 4, 8, 16, 32, 64\}\) denote the desired number of presences to be used in a training model. Steps for generating simulated species:
Generate \(2 * N\) presences according to the habitat suitability function \(L_s\). Repeat for 30 replicates
For each replicate, split the data into two folds such that each fold has \(N\) presences. Repeat for 3 data-splits.
For N = 2, there are \({4 \choose 2} = 6\) ways of splitting the data, of which only 3 are unique: 12/34, 13/24, 14/23.
Check that species are well-modeled:
Use the HMSC cross-validation function to run 2-fold cross-validation for each of the three data-splits. This will result in 3 estimates of AUC, RMSE and Tjur’s \(R^2\) that are averaged over the two folds within the split. Each model will be trained on a data set that has \(N\) presences, and the model will be evaluated using a heldout portion of the data that also has \(N\) presences.
In order to evaluate the ability of the models to recover the response curves:
For each species \(s\), for each sample size \(n\) and for each replicate \(r\):
sims[[s]][[n]][[r]] contains the following objects:
index: stores the locations of presences. For each species x replicate x sample size combo, there should be \(2*n\) presences
Ysim: Binary vector of length 600 (one entry for each site within the study). A site has value 1 if it is occupied and 0 otherwise.
L: Numeric vector of length 600 (one entry for each site within the study). The values are the habitat suitability calculated using the multivariate normal distribution for species \(s\)
split1, split2, split3: Three vectors, each of length 600. A given split indicates which fold (1 or 2) each site is in.
# Build the empty nested structure sims[[species]][[size]][[replicate]][[split]]
# that the generation loop below fills in.
set.seed(12)  # reproducible presence draws and fold assignments
NSites <- nrow(south)
row.names(south) <- seq_len(NSites)
replicates <- paste0("rep", 1:30)
NPresences <- c(64, 32, 16, 8, 4, 2)  # presences per training fold
sizes <- paste0("size", NPresences)   # desired sample size
species <- c("wide_avg", "wide_ext",
             "narrow_avg", "narrow_ext")
splits <- paste0("split", 1:3)
sims <- vector("list", length(species))
names(sims) <- species
for (s in seq_along(species)) {
  sims[[s]] <- vector("list", length(NPresences))
  names(sims[[s]]) <- sizes
  for (n in seq_along(NPresences)) {
    sims[[s]][[n]] <- vector("list", length(replicates))
    names(sims[[s]][[n]]) <- replicates
    for (r in seq_along(replicates)) {
      sims[[s]][[n]][[r]] <- vector("list", length(splits))
      names(sims[[s]][[n]][[r]]) <- splits
    }
  }
}
# For each simulated species:
#   1. compute L, the site-level habitat suitability (bivariate normal density
#      over PC1 and PC3 -- columns 3 and 5 of `south`);
#   2. for each sample size and replicate, draw 2*N presence sites weighted by L;
#   3. build three 2-fold splits so each fold holds N presences and half of the
#      absences (12|34, 13|24 and 14|23 patterns on the presence indices).
# The sampling call order is unchanged from the original, so the same
# set.seed(12) stream yields identical draws.
for (s in seq_along(species)) {
  # Suitability surface for this scenario.
  switch(species[s],
         "wide_avg" = {
           sims[[s]]$L <- dmvnorm(south[, c(3, 5)],
                                  mean = mu_avg, sigma = Sigma_wide)
         },
         "wide_ext" = {
           sims[[s]]$L <- dmvnorm(south[, c(3, 5)],
                                  mean = mu_ext, sigma = Sigma_wide)
         },
         "narrow_avg" = {
           sims[[s]]$L <- dmvnorm(south[, c(3, 5)],
                                  mean = mu_avg, sigma = Sigma_narrow)
         },
         "narrow_ext" = {
           sims[[s]]$L <- dmvnorm(south[, c(3, 5)],
                                  mean = mu_ext, sigma = Sigma_narrow)
         })  # end of switch
  for (n in seq_along(sizes)) {
    for (r in seq_along(replicates)) {
      # Draw 2*N distinct presence sites, weighted by suitability.
      # (The original pre-assigned a throwaway matrix here; it was dead code.)
      sims[[s]][[n]][[r]]$index <-
        sample(1:NSites, 2 * NPresences[n], replace = FALSE,
               prob = sims[[s]]$L / sum(sims[[s]]$L))
      # Binary response: one row per site, 1 where the species is present.
      YSim <- data.frame(SimSp = rep(0, NSites))
      YSim[sims[[s]][[n]][[r]]$index, "SimSp"] <- 1
      sims[[s]][[n]][[r]]$YSim <- YSim
      # Fold-1 presence indices for the three splits of the 2*N draws:
      #   split 1: first N draws            (12|34)
      #   split 2: odd-numbered draws       (13|24)
      #   split 3: draws at positions 1,4,5,8,...  (14|23)
      sites <- 1:NSites
      idx <- sims[[s]][[n]][[r]]$index
      presences1 <- idx[1:NPresences[n]]
      presences2 <- idx[seq(from = 1, to = 2 * NPresences[n], by = 2)]
      pattern <- rep(1:4, NPresences[n] / 2)
      presences3 <- idx[pattern == 1 | pattern == 4]
      # Each fold-1 also gets a random half of the absence sites.
      NAbsences <- (NSites - 2 * NPresences[n]) / 2
      absences <- sites[!sites %in% idx]
      absences1 <- sample(absences, NAbsences, replace = FALSE)
      absences2 <- sample(absences, NAbsences, replace = FALSE)
      absences3 <- sample(absences, NAbsences, replace = FALSE)
      # Fold membership per site (1 = fold one, 2 = fold two); vectorized
      # membership tests replace the original three per-site loops.
      sims[[s]][[n]][[r]]$split1 <-
        ifelse(sites %in% c(presences1, absences1), 1, 2)
      sims[[s]][[n]][[r]]$split2 <-
        ifelse(sites %in% c(presences2, absences2), 1, 2)
      sims[[s]][[n]][[r]]$split3 <-
        ifelse(sites %in% c(presences3, absences3), 1, 2)
    }
  }
}
save(sims, file="../data/sims_data.RData")
Just pick one to look at, wide-average size 4 replicate 1:
load("../data/sims_data.RData")
replicates <- paste0("rep", 1:30)
NPresences <- c(64, 32, 16, 8, 4, 2)
sizes <- paste0("size", NPresences)  # desired sample size
species <- c("wide_avg", "wide_ext",
             "narrow_avg", "narrow_ext")
splits <- paste0("split", 1:3)
# Use the actual site count rather than the hard-coded 474, so this chunk does
# not silently break if the site subset changes.
row.names(south) <- seq_len(nrow(south))
# Attach the example species (wide-average, size 4, replicate 1) to the sites.
# YSim is stored as a one-column data frame; [, 1] extracts it as a plain
# vector (the original took a detour through as.matrix()).
temp <- south
temp$YSim <- sims$wide_avg$size4$rep1$YSim[, 1]
temp$split1 <- sims$wide_avg$size4$rep1$split1
temp$split2 <- sims$wide_avg$size4$rep1$split2
temp$split3 <- sims$wide_avg$size4$rep1$split3
# One map per data split: filled circles = presences (fill = fold),
# crosses = absences (colour = fold). Folds are numeric, so the legends are
# continuous; wrap in factor() if discrete legends are preferred.
ggplot() +
  geom_polygon(data = us,
               aes(x = long, y = lat, group = group),
               colour = "grey20", fill = "grey80") +
  #Group 1
  geom_point(data = temp[temp$YSim == 1, ],
             aes(x = long, y = lat, fill = split1), shape = 21) +
  geom_point(data = temp[temp$YSim == 0, ],
             aes(x = long, y = lat, color = split1), shape = 4) +
  coord_map(projection = "albers", lat0 = 39, lat1 = 40,
            xlim = c(-100, -75), ylim = c(25, 40))
ggplot() +
  geom_polygon(data = us,
               aes(x = long, y = lat, group = group),
               colour = "grey20", fill = "grey80") +
  geom_point(data = temp[temp$YSim == 1, ],
             aes(x = long, y = lat, fill = split2), shape = 21) +
  geom_point(data = temp[temp$YSim == 0, ],
             aes(x = long, y = lat, color = split2), shape = 4) +
  coord_map(projection = "albers", lat0 = 39, lat1 = 40,
            xlim = c(-100, -75), ylim = c(25, 40))
ggplot() +
  geom_polygon(data = us,
               aes(x = long, y = lat, group = group),
               colour = "grey20", fill = "grey80") +
  geom_point(data = temp[temp$YSim == 1, ],
             aes(x = long, y = lat, fill = split3), shape = 21) +
  geom_point(data = temp[temp$YSim == 0, ],
             aes(x = long, y = lat, color = split3), shape = 4) +
  coord_map(projection = "albers", lat0 = 39, lat1 = 40,
            xlim = c(-100, -75), ylim = c(25, 40))
# Sanity check: suitability distribution per species, with red lines marking
# the suitability of the occupied sites (they should sit in the right tail).
par(mfrow=c(1,2))
# NOTE(review): this plots the first 4 replicates per size, while the note
# below says the first 3 reps are shown -- confirm which was intended.
for ( s in 1:4){
for (n in 1:6){
for (r in 1:4) {
hist(sims[[s]]$L, breaks=100, main=paste0(species[s], sizes[n], replicates[r]))
# abline's v argument is vectorized: one red line per occupied site.
abline(v=sims[[s]]$L[sims[[s]][[n]][[r]]$YSim==1], col="red")
}
}
}
For each sample-size x species category, make a histogram of the site IDs that are selected. Ideally there should be some variation here, in that each of the 30 replicates has the species present at a different set of sites.
Note: For brevity, only the first 3 reps of the first 10 simulated species are shown